1 Creating an R object from Spectronaut output files

[Skip for now]

filetable <-  read.table(file="/Users/shubhamagrawal/Documents/work/compressed/DIA/fileTable.txt", header = TRUE)

mae <- readExperimentDIA(fileTable = filetable, 
                         annotation_col = c("treatment", "timepoint", "replicate",
                                            "sampleType"))

2 Use the created object

mae <- readRDS("data/maeObjNEW.Rds")
mae
## A MultiAssayExperiment object of 2 listed
##  experiments with user-defined names and respective classes.
##  Containing an ExperimentList class object of length 2:
##  [1] Phosphoproteome: SummarizedExperiment with 40517 rows and 138 columns
##  [2] Proteome: SummarizedExperiment with 8518 rows and 79 columns
## Functionality:
##  experiments() - obtain the ExperimentList instance
##  colData() - the primary/phenotype DataFrame
##  sampleMap() - the sample coordination DataFrame
##  `$`, `[`, `[[` - extract colData columns, subset, or experiment
##  *Format() - convert into a long or wide DataFrame
##  assays() - convert ExperimentList to a SimpleList of matrices
##  exportClass() - save data to flat files

3 Preprocessing the assay, basic visualization, PCA

se <- mae[["Phosphoproteome"]]
colData(se) <- colData(mae[, colnames(se)])
se
## class: SummarizedExperiment 
## dim: 40517 138 
## metadata(0):
## assays(1): Intensity
## rownames(40517): s1 s2 ... s40516 s40517
## rowData names(6): UniprotID Gene ... Sequence site
## colnames(138): FullProteome_1stCtrl_0min_rep2
##   FullProteome_1stCtrl_0min_rep3 ... Phospho_HGF_40min_rep1
##   Phospho_HGF_6h_rep1
## colData names(6): treatment timepoint ... sample sampleName
plotIntensity(se[, se$sampleType == "Phospho"], color = "replicate") + theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0, size = 7),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) 

newSE <- preprocessPhos(seData = se, transform = "log2", 
                        normalize = TRUE, impute = "QRILC")
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.12)
## than is installed on your system (1.1.0). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at 
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
## Imputing along margin 2 (samples/columns).
newSE
## class: SummarizedExperiment 
## dim: 13081 59 
## metadata(0):
## assays(2): Intensity imputed
## rownames(13081): s1 s4 ... s40511 s40514
## rowData names(6): UniprotID Gene ... Sequence site
## colnames(59): Phospho_1stCtrl_0min_rep2 Phospho_1stCtrl_0min_rep3 ...
##   Phospho_HGF_40min_rep1 Phospho_HGF_6h_rep1
## colData names(6): treatment timepoint ... sample sampleName
plotIntensity(newSE, color = "replicate") + theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0, size = 7),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) 

plotMissing(newSE) + theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0, size = 7),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) 

4 Perform PCA

pca <- stats::prcomp(t(assays(newSE)[["imputed"]]), center = TRUE, scale. = TRUE)
p <- plotPCA(pca = pca, se = newSE,
             color = "treatment",
             shape = "replicate")

p$layers[[1]]$aes_params$size   <- 3
p$layers[[1]]$aes_params$stroke <- 1.2

p + coord_equal() + theme(aspect.ratio = 1, legend.position = "right",
                          axis.text = element_text(color = "#000000")) +
  guides(
    shape = guide_legend(order = 1),
    color = guide_legend(order = 2)
    )

5 rep1 samples need to be removed or perform batch correction

newSE <- preprocessPhos(seData = se, transform = "log2", 
                        normalize = TRUE, impute = "QRILC", 
                        removeOutlier = "rep1")
## Imputing along margin 2 (samples/columns).
newSE
## class: SummarizedExperiment 
## dim: 17035 32 
## metadata(0):
## assays(2): Intensity imputed
## rownames(17035): s1 s4 ... s40513 s40514
## rowData names(6): UniprotID Gene ... Sequence site
## colnames(32): Phospho_1stCtrl_0min_rep2 Phospho_1stCtrl_0min_rep3 ...
##   Phospho_HGF_40min_rep2 Phospho_HGF_40min_rep3
## colData names(6): treatment timepoint ... sample sampleName
pca <- stats::prcomp(t(assays(newSE)[["imputed"]]), 
                     center = TRUE, scale. = TRUE)
p <- plotPCA(pca = pca, se = newSE,
             color = "treatment",
             shape = "replicate")

p$layers[[1]]$aes_params$size   <- 3
p$layers[[1]]$aes_params$stroke <- 1.2

p + coord_equal() + theme(aspect.ratio = 1, legend.position = "right",
                          axis.text = element_text(color = "#000000")) +
  guides(
    shape = guide_legend(order = 1),
    color = guide_legend(order = 2)
    )

p <- plotHeatmap(type = "Top variant", newSE, top = 30, annotationCol = c("replicate", "treatment"))

g <- p$gtable

# Find row names grob
row_names <- which(g$layout$name == "row_names")
g$grobs[[row_names]]$gp <- gpar(fontsize = 8, fontface = "bold")
# Same for columns
col_names <- which(g$layout$name == "col_names")
g$grobs[[col_names]]$gp <- gpar(fontsize = 10)

grid.newpage()
grid.draw(g)

6 Differential expression

dea <- performDifferentialExp(se = newSE, 
                              assay = "imputed", 
                              method = "limma", 
                              condition = "treatment", 
                              reference = "1stCtrl", 
                              target = "EGF")
p <- ggplot(dea$resDE, aes(x = pvalue)) +
  geom_histogram(fill = "grey", col = "blue", alpha=0.7) +
  ggtitle("P value histogram") + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 
  
p
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

pFilter = 0.01
plotVolcano(dea$resDE, pFilter = 0.01, fcFilter = 1) + theme_classic() +
  geom_hline(yintercept = -log10(as.numeric(pFilter)), color = "brown",
               linetype = "dashed") +
    annotate(x = 3.0, y = -log10(as.numeric(pFilter)) - 0.20,
             label = paste0("P-value = ", as.numeric(pFilter)),
             geom = "text", size = 3.5, color = "brown") 

plotVolcanoDEA(dea$resDE, fcFilter = 1, pFilter = 0.01, usePadj = FALSE)

dea$resDE
## # A tibble: 17,035 × 11
##    ID    log2FC  stat   pvalue    padj UniprotID Gene  Position Residue Sequence
##    <I<c>  <dbl> <dbl>    <dbl>   <dbl> <chr>     <chr> <chr>    <chr>   <chr>   
##  1 s320…   8.05  30.2 6.35e-12 1.08e-7 Q9BZQ8    NIBA… 579      S       NMALPSE…
##  2 s5284   2.32  27.6 1.64e-11 1.40e-7 P04920    SLC4… 243      S       QERRRIG…
##  3 s176…  -3.54 -25.0 4.84e-11 2.75e-7 Q3MIN7    RGL3  573      S       PAGSPPA…
##  4 s129…   3.21  23.1 1.17e-10 4.16e-7 Q07889    SOS1  1134     S       PHGPRSA…
##  5 s135…   4.35  23.0 1.22e-10 4.16e-7 Q12802    AKAP… 1731     S       NYNFLPH…
##  6 s3876   4.13  21.7 2.30e-10 6.53e-7 O94763    URI1  492      T       EFVSPSL…
##  7 s4880   7.98  21.0 3.25e-10 7.90e-7 O95870    ABHD… 32       T       APASVPE…
##  8 s366…   9.49  20.1 5.08e-10 1.08e-6 Q9UGH3    SLC2… 79       T       LAETLDS…
##  9 s252…   6.81  18.8 1.07e- 9 2.03e-6 Q8N6T3    ARFG… 304      S       KAEGPLD…
## 10 s222…   5.17  17.9 1.79e- 9 3.05e-6 Q7Z3B3    KANS… 1082     S       ETEAAPT…
## # ℹ 17,025 more rows
## # ℹ 1 more variable: site <chr>
intensityBoxPlot(se = dea$seSub, id = 's4971', symbol = "EGFR_S991")

7 Time series clustering

set.seed(12345)
library(matrixStats)

# newSEts <- addZeroTime(newSE, condition = "treatment", treat = "EGF",
#                        zeroTreat = "1stCtrl",
#                        timeRange = c("20min","40min","100min"))

newSEts <- newSE[ , newSE$treatment == "EGF"]
assayMat <- SummarizedExperiment::assay(newSEts)

exprMat <- lapply(unique(newSEts$timepoint), function(tp) {
  rowMedians(assayMat[,newSEts$timepoint == tp])
  }) %>% bind_cols() %>% as.matrix()
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
rownames(exprMat) <- rownames(newSEts)
colnames(exprMat) <- unique(newSEts$timepoint)

sds <- apply(exprMat,1,sd)
varPer <- 80
exprMat <- exprMat[order(sds, decreasing = TRUE)[seq(1, varPer/100*nrow(exprMat))], ]
# only center when it's for expression
exprMat <- mscale(exprMat)
# remove NA values
exprMat <- exprMat[complete.cases(exprMat), ]


tsc <- clusterTS(x = exprMat, k = 5, pCut = 0.6)
tsc$plot + theme(
  axis.text.x = element_text(size = 10), axis.text = element_text(color = "#000000"), ) 

8 Enrichment analysis

genesetPath <- system.file("shiny-app/geneset", package = "SmartPhos")
inGMT1 <- piano::loadGSC(paste0(genesetPath, "/Cancer_Hallmark.gmt"), 
                         type="gmt")
resTab <- enrichDifferential(dea = dea$resDE, type = "Pathway enrichment", 
                             gsaMethod = "PAGE", geneSet = inGMT1, 
                             statType = "stat", nPerm = 200, sigLevel = 0.05, 
                             ifFDR = FALSE)
## Running gene set analysis:
## Checking arguments...done!
## Final gene/gene-set association: 1218 genes and 50 gene sets
##   Details:
##   Calculating gene set statistics from 1218 out of 4183 gene-level statistics
##   Removed 3168 genes from GSC due to lack of matching gene statistics
##   Removed 0 gene sets containing no genes after gene removal
##   Removed additionally 0 gene sets not matching the size limits
##   Loaded additional information for 50 gene sets
## Calculating gene set statistics...done!
## Calculating gene set significance...done!
## Adjusting for multiple testing...done!
resTab
##                                 Name Gene Number    Stat       p.up p.up.adj
## 1           HALLMARK_MITOTIC_SPINDLE         145  3.3173 0.00045449 0.022724
## 2        HALLMARK_HEDGEHOG_SIGNALING          10 -1.7673 0.96141000 0.994220
## 3 HALLMARK_OXIDATIVE_PHOSPHORYLATION          23 -1.8930 0.97082000 0.994220
## 4            HALLMARK_MYC_TARGETS_V1         112 -2.0049 0.97751000 0.994220
## 5         HALLMARK_KRAS_SIGNALING_DN          14 -2.5253 0.99422000 0.994220
##      p.down p.down.adj Number up Number down
## 1 0.9995500    0.99955        92          53
## 2 0.0385880    0.48235         3           7
## 3 0.0291820    0.48235         9          14
## 4 0.0224850    0.48235        57          55
## 5 0.0057807    0.28904         4          10
inGMT2 <- piano::loadGSC(paste0(genesetPath, "/KEGG_pathways.gmt"), 
                         type="gmt")
clustEnr <- clusterEnrich(clusterTab = tsc$cluster, se = newSE, 
                          inputSet = inGMT2, filterP = 0.01, ifFDR = FALSE)
clusterEnrichExp <- function(clusterTab, se, inputSet, reference = NULL,
                          ptm = FALSE, adj = "BH", filterP = 0.05,
                          ifFDR = FALSE) {

  cluster <- padj <- ifSig <- pval <- Name <- atLeast1 <- NULL
  # If reference is not provided, derive it from the SummarizedExperiment object
  if (is.null(reference)) {
    if (ptm) {
      reference <- rowData(se)$site
    }
    else {
      reference <- unique(rowData(se)$Gene)
    }
  }

  # Perform Fisher's Exact Test for each unique cluster
  resTabFisher <- lapply(unique(clusterTab$cluster), function(cc) {
    # Extract gene IDs for the current cluster
    id <- filter(clusterTab, cluster == cc)$feature
    if (ptm) {
      genes <- unique(elementMetadata(se)[id,]$site)
    }
    else {
      genes <- unique(elementMetadata(se)[id,]$Gene)
    }

    # Run Fisher's Exact Test and annotate results with the cluster ID
    eachOut <- runFisher(genes, reference, inputSet, ptm) %>%
      mutate(cluster = cc)
  }) %>% bind_rows()

  # Filter results based on significance and prepare for plotting
  if (ifFDR) {
    plotTab <- resTabFisher %>%
      filter(padj <= filterP) %>% arrange(padj) %>%
      mutate(ifSig = padj <= filterP) %>%
      group_by(Name) %>% mutate(atLeast1 = sum(ifSig)>0) %>%
      filter(atLeast1) %>% ungroup()
  }
  else {
    plotTab <- resTabFisher %>%
      filter(pval <= filterP) %>% arrange(pval) %>%
      mutate(ifSig = pval <= filterP) %>%
      group_by(Name) %>% mutate(atLeast1 = sum(ifSig)>0) %>%
      filter(atLeast1) %>% ungroup()
  }
  
  #print(plotTab)

  # Create a ggplot object for visualization of enrichment results
  p <- ggplot(plotTab,
              aes(x=cluster, y=Name, customdata = cluster, key = Name)) +
    geom_point(aes(size = Gene.number,fill=-log10(pval)), shape = 21,
               color = "black") +
    scale_fill_gradient(low = "white", high = "red") +
    xlab("Cluster") +
    ylab("Pathway") +
    labs(size="Number of Genes") +
    theme(axis.text.x = element_text(size = 12),
          axis.text.y = element_text(size = 12),
          plot.title = element_text(hjust = 0.5, face = "bold"))
  return(list(table = plotTab, plot = p))
}
clustEnrX <- clusterEnrichExp(clusterTab = tsc$cluster, se = newSE, 
                          inputSet = inGMT2, filterP = 0.01, ifFDR = FALSE)
clustEnrX$plot + theme_classic()

# download the list
df <- as.data.frame(clustEnr$table)
df2 <- apply(df,2,as.character)
write.csv(df2, "ClusterEnrichmentListEGF.csv")

9 Kinase activity inference

First on the differential expression analysis results

netw <- getDecouplerNetwork("Homo sapiens")
scoreTab <- calcKinaseScore(dea$resDE, netw, statType = "stat", nPerm = 100)
plotKinaseDE(scoreTab, nTop = 10, pCut = 0.05)

clusterData <- tsc$cluster[tsc$cluster$cluster == "cluster1",]
allClusterFeature <- clusterData %>%
  distinct(feature, .keep_all = TRUE) %>% .$feature
allClusterSite <- data.frame(rowData(newSEts)[allClusterFeature, "site"])
allClusterSite$feature <- allClusterFeature
clusterData <- clusterData %>%
  left_join(allClusterSite, by = "feature") %>%
  rename(site = "rowData.newSEts..allClusterFeature...site..")
scoreTab <- calcKinaseScore(clusterData, netw, statType = "stat", nPerm = 100)
plotKinaseTimeSeries(scoreTab, pCut = 0.05, clusterName = "cluster1")

10 Phosphosites pattern

newSEts <- addZeroTime(newSE, condition = "treatment", treat = "EGF",
                       zeroTreat = "1stCtrl",
                       timeRange = c("20min","40min","100min"))

rd <- as.data.frame(rowData(newSEts))
timerange <- unique(newSEts$timepoint)
geneList <-  c("MET", "Shc1", "AKT1", "AKT2", "AKT3", "RPS6KA1", "RPS6KA3", "RPS6KA4", "RPS6KB1", "EGFR", "SOS1", "MAP2K2", "MAPK3", "MAPK1", "ERBB2","GAB1","RPS6", "RAF1", "RPS6", "TGFBR2", "AXL", "MTOR")

for (i in geneList) {
  # condition to check if that site is present in row data
  if (i %in% rd$Gene) {
    sites <- rd[rd$Gene==i,]$site
    for (j in sites) {
      # condition to check if assay doesn't have NAs (imputed assays are not allowed)
      if (!is.na(assay(newSEts)[rownames(rd[rd$site==j,]), ][1])) {
        p <- plotTimeSeries(newSEts, type = "expression", 
                          geneID = rownames(rd[rd$site==j,]), 
                          symbol = j, condition = "treatment", 
                          treatment = "EGF", timerange = timerange) +
          theme(axis.text = element_text(color = "#000000", size = 20), 
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5, size = 20),
                axis.title = element_text(size=20),
                plot.title = element_text(size=30)) + 
          scale_x_continuous(breaks = c(0,20,40,60,80,100))
        p$layers[[1]]$aes_params$size <- 6
        p$layers[[1]]$mapping$colour <- NULL
        p$layers[[1]]$aes_params$colour <- "#c1191f"
        p$layers[[2]]$aes_params$linewidth <- 1
        p$layers[[2]]$mapping$colour <- NULL
        p$layers[[2]]$aes_params$colour <- "#555555"
        print(p)
      }
    }
  }
}